{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import pickle\n",
    "import collections\n",
    "import numpy as np\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clearstring(string):\n",
    "    \"\"\"Strip punctuation from one line and keep only its longer tokens.\n",
    "\n",
    "    Every character other than ASCII letters, digits, spaces and the two\n",
    "    quote characters is deleted. The remainder is split on single spaces,\n",
    "    and a token survives only if it is longer than three characters and\n",
    "    does not contain the substring 'nbsp'. Survivors are re-joined with\n",
    "    single spaces.\n",
    "    \"\"\"\n",
    "    cleaned = re.sub('[^\\'\\\"A-Za-z0-9 ]+', '', string)\n",
    "    kept = []\n",
    "    for piece in cleaned.split(' '):\n",
    "        if not piece:\n",
    "            # Runs of spaces yield empty pieces; ignore them.\n",
    "            continue\n",
    "        token = piece.strip()\n",
    "        if len(token) <= 3 or 'nbsp' in token:\n",
    "            continue\n",
    "        kept.append(token)\n",
    "    return ' '.join(kept)\n",
    "\n",
    "def read_data(location, seed=None):\n",
    "    \"\"\"Read every text file under ``location/<folder>/`` into a flat word list.\n",
    "\n",
    "    Each immediate sub-directory of ``location`` is scanned, every file in it\n",
    "    is split into lines, each non-empty line is cleaned with ``clearstring``,\n",
    "    the cleaned lines are shuffled, and finally every line is split on\n",
    "    whitespace so that all tokens end up in one list.\n",
    "\n",
    "    Args:\n",
    "        location: Root directory whose sub-directories hold the text files.\n",
    "        seed: Optional integer. When given, the line shuffle uses a private\n",
    "            ``np.random.RandomState(seed)`` so the returned order is\n",
    "            repeatable; when ``None`` the global NumPy RNG is used.\n",
    "\n",
    "    Returns:\n",
    "        list of str: tokens of all cleaned lines, in shuffled-line order.\n",
    "    \"\"\"\n",
    "    # Only directories can be listed below; sorting fixes the traversal order.\n",
    "    folders = sorted(\n",
    "        name for name in os.listdir(location)\n",
    "        if os.path.isdir(os.path.join(location, name))\n",
    "    )\n",
    "    sentences = []\n",
    "    for folder in folders:\n",
    "        folder_path = os.path.join(location, folder)\n",
    "        for filename in sorted(os.listdir(folder_path)):\n",
    "            # Paths are built from `location`, not a hard-coded 'data/' prefix.\n",
    "            with open(os.path.join(folder_path, filename), 'r') as fopen:\n",
    "                lines = fopen.read().split('\\n')\n",
    "            # Blank lines are dropped before cleaning.\n",
    "            sentences.extend(clearstring(line) for line in lines if line)\n",
    "\n",
    "    rng = np.random if seed is None else np.random.RandomState(seed)\n",
    "    rng.shuffle(sentences)\n",
    "\n",
    "    words = []\n",
    "    for sentence in sentences:\n",
    "        words.extend(sentence.split())\n",
    "    return words\n",
    "\n",
    "def build_vocab(words, n_words):\n",
    "    \"\"\"Map words to integer ids, reserving id 0 for the 'UNK' placeholder.\n",
    "\n",
    "    Args:\n",
    "        words: Sequence of tokens; it is traversed twice (once to count,\n",
    "            once to encode).\n",
    "        n_words: Vocabulary size including the 'UNK' slot; the\n",
    "            ``n_words - 1`` most frequent words receive ids 1, 2, ...\n",
    "\n",
    "    Returns:\n",
    "        tuple: ``(data, count, dictionary, reversed_dictionary)`` where\n",
    "        ``data`` is ``words`` encoded as ids, ``count`` is a list whose first\n",
    "        entry is ``['UNK', <number of tokens encoded as 0>]`` followed by\n",
    "        ``(word, frequency)`` pairs, ``dictionary`` maps word -> id and\n",
    "        ``reversed_dictionary`` maps id -> word.\n",
    "    \"\"\"\n",
    "    frequencies = collections.Counter(words)\n",
    "    # A literal 'UNK' token must share id 0 with the placeholder; letting it\n",
    "    # compete for its own slot would hand two different words the same id.\n",
    "    frequencies.pop('UNK', None)\n",
    "\n",
    "    count = [['UNK', -1]]\n",
    "    count.extend(frequencies.most_common(n_words - 1))\n",
    "    dictionary = {word: index for index, (word, _) in enumerate(count)}\n",
    "\n",
    "    data = [dictionary.get(word, 0) for word in words]\n",
    "    count[0][1] = data.count(0)  # tokens that fell back to 'UNK'\n",
    "    reversed_dictionary = {index: word for word, index in dictionary.items()}\n",
    "    return data, count, dictionary, reversed_dictionary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flatten every file under the 'data' directory into one list of cleaned tokens.\n",
    "strings = read_data('data')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['left', 'feeling', 'very', 'jealous', 'feel']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five tokens as a quick sanity check of the loader.\n",
    "strings[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# build_vocab keeps n_words - 1 real words (one slot is reserved for 'UNK'),\n",
    "# so add 1 to retain every distinct token instead of dropping the rarest one.\n",
    "n_words = len(set(strings)) + 1\n",
    "_, _, dictionary, reversed_dictionary = build_vocab(strings, n_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist both lookup tables as pickles in the working directory.\n",
    "# NOTE(review): 'dataset-dictionary.p' receives `reversed_dictionary` and\n",
    "# 'dataset-dictionary-reverse.p' receives `dictionary`, which reads as inverted\n",
    "# relative to the variable names -- confirm against whatever loads these files.\n",
    "for pickle_path, mapping in (\n",
    "    ('dataset-dictionary.p', reversed_dictionary),\n",
    "    ('dataset-dictionary-reverse.p', dictionary),\n",
    "):\n",
    "    with open(pickle_path, 'wb') as fopen:\n",
    "        pickle.dump(mapping, fopen)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
